Collate larval size data

Author

Max Lindmark

Published

2024-03-28

# Load libraries
library(tidyverse)
library(tidyterra)
library(tidylog)
library(tidync)
library(ggridges)
library(readxl)
library(janitor)
library(lubridate)
library(sdmTMB)
library(ncdf4)
library(patchwork)
library(terra)
library(viridis)
library(devtools)
library(ggsidekick); theme_set(theme_sleek())

home <- here::here()
# source_url("https://raw.githubusercontent.com/maxlindmark/cod-interactions/main/R/functions/map-plot.R")
source(paste0(home, "/R/functions/map-plot.R"))

Explore data

Read and clean data

Old data

# 1992 -2008
length_old <- read_excel(paste0(home, "/data/larvea/1992_2010 MIK SWE Alla arter.xlsx"),
                       sheet = 1,
                       skip = 20)

# Clean data!
length_old <- length_old %>% 
  # some columns are on row 19, most are on 20, so lets fix this manually
  # we take lat from trawl data
  # rename(lat = `lat degree...10`, 
  #        lon = `long degree...16`) %>% 
  # clean names so that I can pivot wider
  clean_names() %>% 
  pivot_longer(cols = x3:x130, names_to = "length_mm", values_to = "n") %>% 
  dplyr::select(year, day, month, haul, species, n, length_mm) %>% 
  # make it numeric
  mutate(length_mm = as.numeric(str_remove(length_mm, "x"))) %>% 
  # NA n means there was no recorded larvae in that size group (note there are also 0s in the data)
  drop_na(n) %>%
  filter(n > 0) %>% 
  # expand the data so that 1 row is 1 individual
  # FIXME: the counts are not integers?! (hence the need for round)
  uncount(round(n)) %>% 
  dplyr::select(-n) %>% 
  # make a unique haul ID so that we can match with trawl data
  mutate(haul_id = paste(year, month, day, haul, sep = "_"))

# Read trawl data and match in coordinates
trawl_old <- read_excel(paste0(home, "/data/larvea/1992-2010 MIK SWE Tråldata.xlsx"),
                        sheet = 1,
                        skip = 8) %>% 
  clean_names() %>% 
  # the last two coordinate columns are decimal degrees of haul position
  rename(haul = haul_no,
         lat = lat_decim_20,
         lon = long_decim_21) %>% 
  # two rows without info, including year, so I'm dropping these
  drop_na(year) %>% 
  mutate(haul_id = paste(year, month, day, haul, sep = "_")) %>% 
  distinct(haul_id, .keep_all = TRUE) %>% 
  dplyr::select(haul_id, lat, lon, temp)
  
# Join trawl data to length data
length_old <- length_old %>%
  left_join(trawl_old, by = "haul_id") %>%
  dplyr::select(-haul) %>% 
  mutate(period = "old",
         day = as.numeric(day),
         month = as.numeric(month))

New data

# 1992 -2008
length_new <- read_excel(paste0(home, "/data/larvea/ELDB(s) bara fisk 2008-2024.xlsx")) %>% 
  clean_names() %>% 
  rename(length_mm = length) %>% 
  dplyr::select(haul_id, species, length_mm) %>% 
  drop_na(length_mm)

trawl_new <- read_excel(paste0(home, "/data/larvea/ELDB 2008-2024.xlsx")) %>% 
  clean_names() %>% 
  rename(lat = start_latitud,
         lon = start_longitud) %>% 
  dplyr::select(year, day, month, haul_id, lat, lon, sur_temp) %>% 
  rename(temp = sur_temp)

length_new <- length_new %>% 
  left_join(trawl_new, by = "haul_id") %>%
  mutate(period = "old",
         year = as.numeric(year),
         day = as.numeric(day),
         month = as.numeric(month))

Join old and new

d <- bind_rows(length_new, length_old) %>% 
  mutate(yday = yday(paste(year, month, day, sep = "-"))) %>% 
  filter(lon > 8) %>% 
  drop_na(lat) %>% 
  rename(temp_obs = temp)

# Add km UTM coords
d <- d %>%
  add_utm_columns(ll_names = c("lon", "lat"))

Explore data

# Sample size
plot_map_fc + 
  geom_point(data = d %>% 
               group_by(haul_id, Y, X, year) %>% 
               summarise(n = n()),
             aes(X*1000, Y*1000, color = n),
             size = 0.5) + 
  facet_wrap(~year, ncol = 8) + 
  scale_color_viridis(trans = "sqrt") + 
  ggtitle("Sample size per haul")

# Day of the year
plot_map_fc + 
  geom_point(data = d %>% 
               distinct(haul_id, .keep_all = TRUE),
             aes(X*1000, Y*1000, color = yday),
             size = 0.5) + 
  facet_wrap(~year, ncol = 8) + 
  scale_color_viridis(trans = "sqrt") + 
  ggtitle("Day of the year of sampling in space")

# Which dates are sampled?
d %>% 
  ggplot(aes(as.factor(month))) + 
  scale_fill_viridis(discrete = TRUE) +
  geom_histogram(stat= "count")

d %>% 
  ggplot(aes(x = yday, y = as.factor(year), fill = after_stat(x))) + 
  scale_fill_viridis(alpha = 0.8, name = "") +
  geom_density_ridges_gradient(alpha = 0.75) + 
  theme_facet_map() + 
  labs(y = "year")

# Species
sort(unique(d$species))
  [1] "aequorea"                     "agonus cataphractus"         
  [3] "Agonus cataphractus"          "alloteuthis subulata"        
  [5] "ammodytes marinus"            "ammodytidae"                 
  [7] "Ammodytidae"                  "anarhichas lupus"            
  [9] "Anarhichas lupus"             "anguilla anguilla"           
 [11] "Anguilla anguilla"            "aphia minuta"                
 [13] "Aphia minuta"                 "argentina silus"             
 [15] "Argentina silus"              "argentina sphyraena"         
 [17] "argentina spp"                "Argentina spp"               
 [19] "argentinidae"                 "arnoglossus laterna"         
 [21] "Arnoglossus laterna"          "branchiostoma lanceolatum"   
 [23] "Branchiostoma lanceolatum"    "buglossidium luteum"         
 [25] "Buglossidium luteum"          "callionymidae"               
 [27] "Callionymidae"                "callionymus lyra"            
 [29] "Callionymus lyra"             "callionymus maculatus"       
 [31] "Callionymus maculatus"        "callionymus reticulatus"     
 [33] "chirolophis  ascanii"         "Chirolophis  ascanii"        
 [35] "clupea harengus"              "Clupea harengus"             
 [37] "CLUPEA HARENGUS ADULT"        "clupeidae"                   
 [39] "Clupeidae"                    "cottidae"                    
 [41] "Cottus spp"                   "crystallogobius linearis"    
 [43] "Crystallogobius linearis"     "Cyclopterus lumpus"          
 [45] "echiodon drummondi"           "enchelyopus cimbrius"        
 [47] "Enchelyopus cimbrius"         "Engraulis encrasicholus"     
 [49] "engraulis encrasicolus"       "entelurus aequoreus"         
 [51] "Entelurus aequoreus"          "eutrigla gurnardus"          
 [53] "Eutrigla gurnardus"           "Gadidae"                     
 [55] "gadus morhua"                 "Gadus morhua"                
 [57] "gaidropsarus argentatus"      "gasterosteus aculeatus"      
 [59] "Gasterosteus aculeatus"       "glyptocephalus cynoglossus"  
 [61] "Glyptocephalus cynoglossus"   "Gobiidae"                    
 [63] "hippoglossoides platessoides" "Hippoglossoides platessoides"
 [65] "Hyperoplus lanceolatus"       "illex"                       
 [67] "lebetus scorpioides"          "Lebetus scorpioides"         
 [69] "limanda limanda"              "Limanda limanda"             
 [71] "liparis montagui"             "Liparis montagui"            
 [73] "loligo"                       "Lumpenus lampretaeformis"    
 [75] "maurolicus muelleri"          "Maurolicus muelleri"         
 [77] "merlangius merlangus"         "Merlangius merlangus"        
 [79] "merluccius merluccius"        "Merluccius merluccius"       
 [81] "microstomus kitt"             "Microstomus kitt"            
 [83] "molva molva"                  "Mugilidae"                   
 [85] "myoxocephalus scorpioides"    "myoxocephalus scorpius"      
 [87] "Myoxocephalus scorpius"       "Nerophis lumbriciformis"     
 [89] "Osmerus eperlanus"            "pholis gunnellus"            
 [91] "Pholis gunnellus"             "phrynorhombus norvegicus"    
 [93] "Phrynorhombus norvegicus"     "phycidae"                    
 [95] "Phycidae"                     "physidae"                    
 [97] "pleuronectes platessa"        "Pleuronectes platessa"       
 [99] "pleuronectidae"               "Pleuronectidae"              
[101] "Pleuronectiformes"            "Pollachius pollachius"       
[103] "pomatoschistus sp"            "Pomatoschistus sp"           
[105] "sardina pilchardus"           "sepiola atlantica"           
[107] "solea solea"                  "Solea solea"                 
[109] "sprattus sprattus"            "Sprattus sprattus"           
[111] "SPRATTUS SPRATTUS ADULT"      "syngnathus acus"             
[113] "Syngnathus acus"              "syngnathus rostellatus"      
[115] "Syngnathus rostellatus"       "Syngnathus spp"              
[117] "syngnathus typhle"            "Syngnathus typhle"           
[119] "taurulus bubalis"             "Taurulus bubalis"            
[121] "Trachurus trachurus"          "Triglidae"                   
[123] "Trisopterus esmarkii"         "UNIDENTIFIED"                
# Clean species names!
d <- d %>% 
  mutate(species = str_to_sentence(species))

sort(unique(d$species))
 [1] "Aequorea"                     "Agonus cataphractus"         
 [3] "Alloteuthis subulata"         "Ammodytes marinus"           
 [5] "Ammodytidae"                  "Anarhichas lupus"            
 [7] "Anguilla anguilla"            "Aphia minuta"                
 [9] "Argentina silus"              "Argentina sphyraena"         
[11] "Argentina spp"                "Argentinidae"                
[13] "Arnoglossus laterna"          "Branchiostoma lanceolatum"   
[15] "Buglossidium luteum"          "Callionymidae"               
[17] "Callionymus lyra"             "Callionymus maculatus"       
[19] "Callionymus reticulatus"      "Chirolophis  ascanii"        
[21] "Clupea harengus"              "Clupea harengus adult"       
[23] "Clupeidae"                    "Cottidae"                    
[25] "Cottus spp"                   "Crystallogobius linearis"    
[27] "Cyclopterus lumpus"           "Echiodon drummondi"          
[29] "Enchelyopus cimbrius"         "Engraulis encrasicholus"     
[31] "Engraulis encrasicolus"       "Entelurus aequoreus"         
[33] "Eutrigla gurnardus"           "Gadidae"                     
[35] "Gadus morhua"                 "Gaidropsarus argentatus"     
[37] "Gasterosteus aculeatus"       "Glyptocephalus cynoglossus"  
[39] "Gobiidae"                     "Hippoglossoides platessoides"
[41] "Hyperoplus lanceolatus"       "Illex"                       
[43] "Lebetus scorpioides"          "Limanda limanda"             
[45] "Liparis montagui"             "Loligo"                      
[47] "Lumpenus lampretaeformis"     "Maurolicus muelleri"         
[49] "Merlangius merlangus"         "Merluccius merluccius"       
[51] "Microstomus kitt"             "Molva molva"                 
[53] "Mugilidae"                    "Myoxocephalus scorpioides"   
[55] "Myoxocephalus scorpius"       "Nerophis lumbriciformis"     
[57] "Osmerus eperlanus"            "Pholis gunnellus"            
[59] "Phrynorhombus norvegicus"     "Phycidae"                    
[61] "Physidae"                     "Pleuronectes platessa"       
[63] "Pleuronectidae"               "Pleuronectiformes"           
[65] "Pollachius pollachius"        "Pomatoschistus sp"           
[67] "Sardina pilchardus"           "Sepiola atlantica"           
[69] "Solea solea"                  "Sprattus sprattus"           
[71] "Sprattus sprattus adult"      "Syngnathus acus"             
[73] "Syngnathus rostellatus"       "Syngnathus spp"              
[75] "Syngnathus typhle"            "Taurulus bubalis"            
[77] "Trachurus trachurus"          "Triglidae"                   
[79] "Trisopterus esmarkii"         "Unidentified"                
# FIXME: check Physidae, Phycidae
d <- d %>% 
  filter(!species %in% c("Unidentified", "Sprattus sprattus adult", "Clupea harengus adult"))

d %>% 
  group_by(year, species) %>% 
  summarise(n = n()) %>% 
  ggplot(aes(year, n, fill = species)) + 
  guides(fill = "none") + 
  facet_wrap(~species, scales = "free_y") + 
  geom_bar(stat = "identity")

# Filter species with at least 10 years of data
d <- d %>% 
  group_by(species, year) %>% 
  # At least 3 samples by year
  mutate(n = n()) %>% 
  ungroup() %>% 
  filter(n >= 3) %>% 
  group_by(species) %>% 
  mutate(n_years = length(unique(year))) %>% 
  ungroup() %>% 
  filter(n_years >= 10) %>% 
  dplyr::select(-n_years, -n)

d %>% 
  group_by(year, species) %>% 
  summarise(n = n()) %>% 
  ggplot(aes(year, n, fill = species)) + 
  guides(fill = "none") + 
  facet_wrap(~species, scales = "free_y", ncol = 5) + 
  geom_bar(stat = "identity") + 
  scale_fill_viridis(discrete = TRUE)

# Plot species samples in space, color by year
plot_map_fc + 
  geom_point(data = d,
             aes(X*1000, Y*1000, color = year),
             size = 0.5) + 
  facet_wrap(~species, ncol = 6) + 
  scale_color_viridis() + 
  ggtitle("Samples in space by species and year")

# Fixme: Crystallugobius not a larvate, right?

Add temperature covariate to hauls

# https://data.marine.copernicus.eu/product/NWSHELF_MULTIYEAR_PHY_004_009/download?dataset=cmems_mod_nws_phy-t_my_7km-3D_P1M-m_202012

# Uhhhh this is annoying. The extent of either the NS or BS do not cover the whole data. For now, I'm using both and splitting the data... 

print(nc_open(paste0(home, "/data/copernicus/cmems_mod_nws_phy-t_my_7km-3D_P1M-m_1711622425797.nc")))
File /Users/maxlindmark/Dropbox/Max work/R/larval-sizes/data/copernicus/cmems_mod_nws_phy-t_my_7km-3D_P1M-m_1711622425797.nc (NC_FORMAT_NETCDF4):

     1 variables (excluding dimension variables):
        float thetao[longitude,latitude,depth,time]   (Contiguous storage)  
            units: degrees_C
            _FillValue: NaN
            standard_name: sea_water_potential_temperature
            long_name: Sea Water Potential Temperature

     4 dimensions:
        depth  Size:1 
            standard_name: depth
            long_name: Depth
            units: m
            unit_long: Meters
            axis: Z
            positive: down
            valid_min: 0
            valid_max: 5000
        latitude  Size:104 
            standard_name: latitude
            long_name: Latitude
            units: degrees_north
            unit_long: Degrees North
            axis: Y
            valid_min: 40.0666694641113
            valid_max: 65.0012512207031
        longitude  Size:53 
            standard_name: longitude
            long_name: Longitude
            units: degrees_east
            unit_long: Degrees East
            axis: X
        time  Size:368 
            standard_name: time
            long_name: Time
            units: seconds since 1970-01-01 00:00:00
            calendar: gregorian
            axis: T

    12 global attributes:
        Conventions: CF-1.11
        title: monthly-mean potential temperature (3D)
        institution: UK Met Office
        source: AMM-FOAM 7 km (tidal) NEMO v3.6_FABM-ERSEM v15.06_NEMOVAR v6
        history: See source and creation_date attributes
        credit: E.U. Copernicus Marine Service Information (CMEMS)
        contact: servicedesk.cmems@mercator-ocean.eu
        references: http://marine.copernicus.eu/
        subset:source: ARCO data downloaded from the Marine Data Store using the MyOcean Data Portal
        subset:productId: NWSHELF_MULTIYEAR_PHY_004_009
        subset:datasetId: cmems_mod_nws_phy-t_my_7km-3D_P1M-m_202012
        subset:date: 2024-03-28T10:40:25.797Z
temp_tibble_ns <- tidync(paste0(home, "/data/copernicus/cmems_mod_nws_phy-t_my_7km-3D_P1M-m_1711622425797.nc")) %>%
  hyper_tibble() %>% 
  mutate(date = as_datetime(time, origin = '1970-01-01')) %>% 
  mutate(month = month(date),
         day = day(date),
         year = year(date)) %>% 
  # Filter January
  filter(month == 1) %>% 
  rename(sst = thetao) %>% 
  add_utm_columns()

# plot_map + 
#   geom_point(data = temp_tibble, aes(X*1000, Y*1000))

# Now do the baltic
print(nc_open(paste0(home, "/data/copernicus/cmems_mod_bal_phy_my_P1M-m_1711622828562.nc")))
File /Users/maxlindmark/Dropbox/Max work/R/larval-sizes/data/copernicus/cmems_mod_bal_phy_my_P1M-m_1711622828562.nc (NC_FORMAT_NETCDF4):

     1 variables (excluding dimension variables):
        float thetao[longitude,latitude,depth,time]   (Contiguous storage)  
            units: degrees_C
            _FillValue: NaN
            standard_name: sea_water_potential_temperature
            long_name: Potential temperature

     4 dimensions:
        depth  Size:1 
            standard_name: depth
            long_name: Depth
            units: m
            unit_long: Meters
            axis: Z
            positive: down
            valid_min: 0.501646220684052
            valid_max: 712.024658203125
        latitude  Size:398 
            standard_name: latitude
            long_name: Latitude
            units: degrees_north
            unit_long: Degrees North
            axis: Y
            valid_min: 53.0082969665527
            valid_max: 65.8909912109375
        longitude  Size:183 
            standard_name: longitude
            long_name: Longitude
            units: degrees_east
            unit_long: Degrees East
            axis: X
        time  Size:348 
            standard_name: time
            long_name: Time
            units: seconds since 1970-01-01 00:00:00
            calendar: gregorian
            axis: T

    11 global attributes:
        Conventions: CF-1.11
        title: CMEMS NEMO monthly integrated model fields
        institution: Baltic MFC, PU Danish Meteorological Institute
        source: CMEMS BAL MFC NEMO model output converted to NetCDF
        contact: servicedesk.cmems@mercator-ocean.eu
        references: https://marine.copernicus.eu/
        comment: Data on cropped native product grid. Horizontal velocities destagged
        subset:source: ARCO data downloaded from the Marine Data Store using the MyOcean Data Portal
        subset:productId: BALTICSEA_MULTIYEAR_PHY_003_011
        subset:datasetId: cmems_mod_bal_phy_my_P1M-m_202303
        subset:date: 2024-03-28T10:47:08.563Z
# https://data.marine.copernicus.eu/product/BALTICSEA_MULTIYEAR_PHY_003_011/download?dataset=cmems_mod_bal_phy_my_P1M-m_202303
temp_tibble_bs <- tidync(paste0(home, "/data/copernicus/cmems_mod_bal_phy_my_P1M-m_1711622828562.nc")) %>%
  hyper_tibble() %>% 
  mutate(date = as_datetime(time, origin = '1970-01-01')) %>% 
  mutate(month = month(date),
         day = day(date),
         year = year(date)) %>% 
  # Filter January
  filter(month == 1) %>% 
  rename(sst = thetao) %>% 
  add_utm_columns()

# plot_map + 
#   geom_point(data = temp_tibble_bs, aes(X*1000, Y*1000))

# Combine temperature tibbles

temp_tibble <- bind_rows(temp_tibble_ns %>%
                           filter(longitude <= min(temp_tibble_bs$longitude)), 
                         temp_tibble_bs)

# Loop through all month-year combos, extract the temperatures at the data locations

# Loop, make raster and extract
temp_list <- list()

# Trim years we have temperature for (again, annoying! Fix the temperatures later)
d <- d %>% 
  filter(year >= 1993 & year <= 2021)

for(i in unique(d$year)) {
  
  d_sub <- filter(d, year == i)
  temp_tibble_sub_bs <- filter(temp_tibble_bs, year == i)
  temp_tibble_sub_ns <- filter(temp_tibble_ns, year == i)
  
  # Convert to raster
  temp_raster_bs <- as_spatraster(temp_tibble_sub_bs, xycols = 2:3,
                                  crs = "WGS84", digits = 3)

  temp_raster_ns <- as_spatraster(temp_tibble_sub_ns, xycols = 2:3,
                                  crs = "WGS84", digits = 3)
    
  # p1 <- ggplot() + 
  #   geom_spatraster(data = temp_raster_bs$sst, aes(fill = sst))
  # 
  # p2 <- ggplot() + 
  #   geom_spatraster(data = temp_raster_ns$sst, aes(fill = sst))
  # 
  # p1 + p2
  
  # Extract from raster
  d_bs <- d_sub %>% 
    filter(lon >= min(temp_tibble_sub_bs$longitude))
  
  d_ns <- d_sub %>% 
    filter(lon < min(temp_tibble_sub_bs$longitude))
  
  d_bs$temp <- terra::extract(temp_raster_bs$sst,
                              d_bs %>% dplyr::select(lon, lat))$sst  
    
  d_ns$temp <- terra::extract(temp_raster_ns$sst,
                              d_ns %>% dplyr::select(lon, lat))$sst  
    
  d_comb <- bind_rows(d_ns, d_bs)
  
  # Save
  temp_list[[i]] <- d_comb
  
}

d <- bind_rows(temp_list)

Plot response variables

d %>% 
  summarise(n = n(), .by = species) %>% 
  arrange(desc(n))
# A tibble: 17 × 2
   species                      n
   <chr>                    <int>
 1 Crystallogobius linearis 11054
 2 Clupea harengus          10164
 3 Aphia minuta              5800
 4 Pholis gunnellus          4169
 5 Pomatoschistus sp         1930
 6 Ammodytidae               1221
 7 Syngnathus rostellatus    1114
 8 Chirolophis  ascanii      1075
 9 Sprattus sprattus          928
10 Microstomus kitt           610
11 Myoxocephalus scorpius     508
12 Agonus cataphractus        379
13 Anguilla anguilla          268
14 Limanda limanda            255
15 Argentina spp               76
16 Enchelyopus cimbrius        68
17 Maurolicus muelleri         55
# Distribution of data
ggplot(d, aes(length_mm)) + 
  geom_histogram() + 
  facet_wrap(~species, scales = "free")

# Effect of day of the year
ggplot(d, aes(yday, length_mm)) + 
  geom_point(size = 0.4, alpha = 0.4) +  
  geom_smooth(method = "lm") +
  facet_wrap(~species, scales = "free")

# Effect of year
ggplot(d, aes(year, length_mm)) + 
  geom_point(size = 0.4, alpha = 0.4) +  
  geom_smooth(method = "lm") +
  facet_wrap(~species, scales = "free")

# Effect of temperature
ggplot(d, aes(temp, length_mm)) + 
  geom_point(size = 0.4, alpha = 0.4) +  
  geom_smooth(method = "lm") +
  #geom_smooth() +
  facet_wrap(~species, scales = "free")

Save data

write_csv(d, paste0(home, "/data/clean/larval_size.csv"))